本文共 5330 字,大约阅读时间需要 17 分钟。
点击右键审查元素
查看关键的标签对和网页的编码
注意:目前是定向爬虫,爬取的数据策略可能随着网站更新而变化
# coding=utf-8
#!/usr/bin/python
from baike_spider import url_manager, html_download, html_parser, html_outputer


class SpiderMain(object):
    """Crawler scheduler: wires the URL manager, downloader, parser and outputer together."""

    def __init__(self):
        # URL manager: tracks pending vs. already-crawled URLs
        self.urls = url_manager.UrlManager()
        # Page downloader
        self.downloader = html_download.HtmlDownload()
        # Page parser (extracts follow-up links + title/summary)
        self.parser = html_parser.HtmlParser()
        # Result collector / HTML report writer
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url, max_pages=1000):
        """Crawl breadth-first starting from *root_url*.

        Stops after *max_pages* successfully fetched pages (default 1000,
        matching the original hard-coded limit) or when no URLs remain,
        then writes the collected data out as an HTML report.
        """
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                # Fetch the next pending URL
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_content = self.downloader.downloader(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_content)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == max_pages:
                    break
                count = count + 1
            except Exception as e:
                # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
                # work, and the failure reason is no longer silently discarded.
                print("craw failed", e)
        self.outputer.output_html()


if __name__ == '__main__':
    # Entry URL: the Baidu Baike article on Python
    root_url = "https://baike.baidu.com/item/Python/407313"
    obj_spider = SpiderMain()
    # Start the crawler
    obj_spider.craw(root_url)
# coding=utf-8
#!/usr/bin/python
# URL manager


class UrlManager(object):
    """Keeps two deduplicated pools: URLs still to crawl and URLs already crawled."""

    def __init__(self):
        # Pending URLs, deduplicated
        self.new_urls = set()
        # URLs that have already been handed out for crawling
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue one URL unless it is None or has been seen in either pool."""
        if url is None:
            return
        if url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue a batch of URLs; None or an empty collection is a no-op."""
        if not urls:
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """True while at least one URL is still pending."""
        return len(self.new_urls) > 0

    def get_new_url(self):
        """Pop an arbitrary pending URL, record it as crawled, and return it."""
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
# coding=utf-8
#!/usr/bin/python
'''HTML downloader'''
import urllib.request


class HtmlDownload(object):
    """Fetches the raw body of a web page."""

    def downloader(self, url):
        """Download *url* and return the response body as bytes.

        Returns None when *url* is None or the server does not answer 200.
        """
        if url is None:
            return None
        # `with` closes the connection deterministically; the original
        # leaked the response object on every call.
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                print("response.getcode() =", response.getcode())
                return None
            return response.read()
# coding=utf-8
#!/usr/bin/python
'''HTML parser: given a page URL and its HTML, extracts follow-up entry
links plus a data dict with the entry's url, title and summary.'''
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        """Collect absolute URLs of other Baike entries linked from this page."""
        new_urls = set()
        # NOTE(review): the original href pattern was lost when this code was
        # scraped; '/item/' is the current Baidu Baike entry path — confirm
        # against the live site before relying on it.
        links = soup.find_all('a', href=re.compile(r'/item/'))
        for link in links:
            # Resolve relative hrefs against the page they came from.
            new_urls.add(urljoin(page_url, link['href']))
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Extract the entry title and summary paragraph into a dict."""
        # 'url' is included because the outputer renders data['url'].
        res_data = {'url': page_url}
        title_node = soup.find(
            'dd', class_='lemmaWgt-lemmaTitle-title').find("h1")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_content):
        """Return (new_urls, data) for one downloaded page, or None on bad input."""
        if page_url is None or html_content is None:
            return
        soup = BeautifulSoup(
            html_content, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
# coding=utf-8
#!/usr/bin/python
'''HTML outputer: accumulates parsed records and renders them as an HTML table.'''


class HtmlOutputer(object):

    def __init__(self):
        # Parsed records ({'url', 'title', 'summary'} dicts) in crawl order.
        self.datas = []

    def collect_data(self, data):
        """Store one parsed record; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write all collected records to output.html as a simple table.

        NOTE(review): the literal markup was stripped when this code was
        scraped; reconstructed here as a minimal valid HTML table. `with`
        replaces the original unclosed file handle.
        """
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table>')
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                fout.write('<td>%s</td>' % data['title'])
                fout.write('<td>%s</td>' % data['summary'])
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
爬取1000个百科相关网页,并解析title和summary
file:///D:/EclipseProject/GrawDemo/baike_spider/output.html